Read in and clean up datafiles

Read in classification outputs (from get_classifications.py)

meta <- read.csv('classification-outputs/museumstation_subset_classification.csv') %>%
  as.tibble() %>%
  mutate(session_id = paste('cdm_',session_ids,sep="")) %>%
  mutate(age_numeric = ages) %>%
  mutate(age = paste('age',ages,sep="")) %>%
  mutate(category = target_classes) %>% 
  mutate(category_short = str_split_fixed(category," ",2)[,2]) %>%
  mutate(image_name = paste(category,'_sketch_', age,'_', session_id,'.png',sep="")) %>%
  select(-X) 
## Warning: package 'bindrcpp' was built under R version 3.4.4

Merge with mongodb database outputs (stroke count, duration, etc.)

Now should be able to look at target label probabilties for each image as a function of all of these other factors

d <- read.csv('mongodb-output/MuseumStation_AllDescriptives_20780_images_final_cdm_run_v3.csv') %>%
  as.tibble() %>%
  left_join(meta) %>% # should join on session_id, category, age -- combination of which is unique identifier for an image
  filter(!is.na(target_label_prob)) # if intermediate file, only look where we have data
## Joining, by = c("session_id", "category", "age")
## Warning: Column `session_id` joining factor and character vector, coercing
## into character vector
## Warning: Column `category` joining factors with different levels, coercing
## to character vector
## Warning: Column `age` joining factor and character vector, coercing into
## character vector
## we only have meta data for those in cdm_run_v3, and we ran some classifications on cdm_run_v2 -- so mismatch here.
# test_meta <- meta %>%
#   filter(!(session_id %in% mongodb_meta$session_id)) 

Plot data before averaging to get a sense of the distributions

Target label probs for each category; dots colored by whether it was correctly classified

ggplot(d, aes(age_numeric,target_label_prob, col=image_scores)) +
  theme_few() + 
  geom_jitter(alpha=.2, height=0, width=.3) +
  geom_smooth(span=10, col='dark grey') + 
  # scale_color_viridis(option="B") + 
  theme(legend.position = "none") + 
  facet_wrap(~category) +
  labs(y = 'Correct label probability', x = 'Age')
## `geom_smooth()` using method = 'loess'

Plot number of strokes x probability of target class

ggplot(d, aes(num_strokes,log(target_label_prob), color=image_scores)) +
  geom_point(alpha=.5) +
  theme_few() + 
  geom_smooth(method='lm') +
  xlim(c(0,50)) + 
  facet_wrap(~category)
## Warning: Removed 67 rows containing non-finite values (stat_smooth).
## Warning: Removed 67 rows containing missing values (geom_point).

Plot mean intensity x probability of target class

ggplot(d, aes(mean_intensity,target_label_prob, color=image_scores)) +
  geom_point(alpha=.5) +
  theme_few() + 
  geom_smooth(method='lm') +
  facet_wrap(~category)

Plot draw duration x probability of target class

ggplot(d, aes(draw_duration_old,target_label_prob, color=image_scores, names = "recognized or not")) +
  geom_point(alpha=.5) +
  theme_few() + 
  geom_smooth(method='lm') +
  facet_wrap(~category) +
  labs(x='Time spent drawing', y='Log probability', color="correctly classified?")

Intensity vs. strokes by age

ggplot(d, aes(mean_intensity,num_strokes)) +
  geom_point(alpha=.5) +
  theme_few() + 
  geom_smooth(method='lm') +
  facet_wrap(~age_numeric) +
  ylim(c(0,50)) + ## some crazy outliers here, probably scribbles we didn't catch?
  labs(x='Ink', y='Num strokes')
## Warning: Removed 67 rows containing non-finite values (stat_smooth).
## Warning: Removed 67 rows containing missing values (geom_point).

Inferential stats

Generalized linear mixed models on accuracy all covariates

mod_covariates <- glmer(image_scores ~ scale(age_numeric) +
                          scale(draw_duration_old) +
                          scale(mean_intensity) +
                          scale(num_strokes) +
                        (1|session_id) +
                        (1|category),
      data = d,
      family = "binomial")

kable(summary(mod_covariates)$coef, digits = 3)
Estimate Std. Error z value Pr(>|z|)
(Intercept) -0.188 0.219 -0.857 0.391
scale(age_numeric) 0.530 0.039 13.440 0.000
scale(draw_duration_old) 0.211 0.043 4.927 0.000
scale(mean_intensity) -0.059 0.041 -1.444 0.149
scale(num_strokes) -0.284 0.088 -3.233 0.001

Visualize predictions

predicted_df = data.frame(glmer_predictions = predict(mod_covariates, d), age=d$age_numeric, category=d$category)

(predicted_accuracy <- ggplot(data = predicted_df, aes(x=age, y=glmer_predictions, col=age)) +
  scale_color_viridis()  +
  theme_few() + 
  geom_jitter(alpha=.1, width=.2, height=0) + 
  stat_smooth(col='grey', method='lm') +
  theme(legend.position = "none") + 
  scale_x_discrete(limits=c(2,3,4,5,6,7,8,9,10)) +
  labs(x = 'Age', y = 'Predicted accuracy of actual category'))

ggsave('plots-nov26/predicted_accuracy.png',predicted_accuracy, width = 6, height = 4 )

Generalized linear mixed model on category probabilities with all covariates

mod_covariates_prob <- glmer(target_label_prob ~ scale(age_numeric) +
                          scale(draw_duration_old) +
                          scale(mean_intensity) +
                          scale(num_strokes) +
                        (1|session_id) +
                        (1|category),
      data = d)
## Warning in glmer(target_label_prob ~ scale(age_numeric) +
## scale(draw_duration_old) + : calling glmer() with family=gaussian (identity
## link) as a shortcut to lmer() is deprecated; please call lmer() directly
kable(summary(mod_covariates_prob)$coef, digits = 3)
Estimate Std. Error t value
(Intercept) 0.090 0.003 31.212
scale(age_numeric) 0.007 0.000 17.255
scale(draw_duration_old) 0.003 0.000 7.221
scale(mean_intensity) -0.002 0.000 -5.368
scale(num_strokes) -0.001 0.000 -1.729

Visualize predictions by age

predicted_df = data.frame(glmer_predictions_prob = predict(mod_covariates_prob, d), age=d$age_numeric, category=d$category)

## Model prediction lines overlaid on raw data fed to the model
(predicted_probs <- ggplot(data = predicted_df, aes(x=age, y=glmer_predictions_prob, col=age)) +
  scale_color_viridis()  +
  theme_few() + 
  geom_jitter(alpha=.1, width=.2, height=0) + 
  stat_smooth(col='grey', method='lm') +
  theme(legend.position = "none") + 
  scale_x_discrete(limits=c(2,3,4,5,6,7,8,9,10)) +
  labs(x = 'Age', y = 'Predicted probability of actual category'))

ggsave('plots-nov26/predicted_probs.png',predicted_probs, width = 6, height = 4 )

Run model on probabilities only when it was classified correctly

E.g., is there an increase in confidence independent of an increase in accuracy of the classifier?

d_correct <- d %>%
  filter(image_scores == 1)

mod_covariates_correct_only <- glmer(target_label_prob ~ scale(age_numeric) +
                          scale(draw_duration_old) +
                          scale(mean_intensity) +
                          scale(num_strokes) +
                        (1|session_id) +
                        (1|category),
      data = d_correct)
## Warning in glmer(target_label_prob ~ scale(age_numeric) +
## scale(draw_duration_old) + : calling glmer() with family=gaussian (identity
## link) as a shortcut to lmer() is deprecated; please call lmer() directly
modelOut=summary(mod_covariates_correct_only)
kable(summary(mod_covariates_correct_only)$coef, digits = 3)
Estimate Std. Error t value
(Intercept) 0.109 0.003 34.680
scale(age_numeric) 0.003 0.000 8.271
scale(draw_duration_old) 0.001 0.000 2.923
scale(mean_intensity) -0.002 0.000 -5.347
scale(num_strokes) -0.001 0.000 -2.209

Render out subsets of classifications by classification scores

Set parameters

### Set parameters first
categories = unique(meta$category)
##
upper_thresholds=c(1,.85,.65,.45,.25,.05)
lower_thresholds=c(.95,.80,.60,.40,.20,0)
##
age_thres = 0
dir_name = 'subset_classification_examples_test'
dir.create(dir_name)
## Warning in dir.create(dir_name): 'subset_classification_examples_test'
## already exists

Render out random sample with these parameters

##
for (this_category in categories){
  dir.create(file.path(paste(dir_name,'/',this_category,sep="")))
  thres_count=0
  
  for (upper in upper_thresholds) {
    thres_count = thres_count + 1
    lower = lower_thresholds[thres_count]
    subset <- meta %>%
      filter(age_numeric > age_thres) %>%
      group_by(category) %>%
      mutate(upper_thres = quantile(target_label_prob, upper, na.rm = TRUE)) %>%
      mutate(lower_thres = quantile(target_label_prob, lower, na.rm = TRUE)) %>%
      filter(category == this_category) %>%
      filter(target_label_prob > lower_thres & target_label_prob < upper_thres) %>%
      sample_n(2) %>%
      mutate(image_path = paste('srcd-features/museumstation_sketches/',category,'/',image_name,sep="")) %>%
      mutate(new_image_path = paste(dir_name,'/',this_category,'/',
                                    round(target_label_prob,4),image_name,sep=""))
    
    file.copy(subset$image_path, subset$new_image_path)
  }
}

### Make montages of these randomly sampled sketches for use in diagrams
dir.create(paste0(dir_name,'/montages/'))
for (this_category in categories){
  image_read(dir(paste(dir_name, "/",this_category,sep=""), full.names = TRUE)) %>%
  image_append(stack = FALSE) %>%
    image_write(file.path(paste0(dir_name,"/montages/", this_category,".png")))
}

Correct only

##
age_thres = 4
dir_name = 'subset_classification_examples_ages4_10_correct_only'
dir.create(dir_name)

##
for (this_category in categories){
  dir.create(file.path(paste(dir_name,'/',this_category,sep="")))
  thres_count=0
  
  for (upper in upper_thresholds) {
    thres_count = thres_count + 1
    lower = lower_thresholds[thres_count]
    subset <- meta %>%
      filter(age_numeric > age_thres) %>%
      filter(image_scores==1 ) %>%
      group_by(category) %>%
      mutate(upper_thres = quantile(target_label_prob, upper, na.rm = TRUE)) %>%
      mutate(lower_thres = quantile(target_label_prob, lower, na.rm = TRUE)) %>%
      filter(category == this_category) %>%
      filter(target_label_prob > lower_thres & target_label_prob < upper_thres) %>%
      sample_n(2) %>%
      mutate(image_path = paste('srcd-features/museumstation_sketches/',category,'/',image_name,sep="")) %>%
      mutate(new_image_path = paste(dir_name,'/',this_category,'/',
                                    round(target_label_prob,4),image_name,sep=""))
    
    file.copy(subset$image_path, subset$new_image_path)
  }
}

### Make montages of these randomly sampled sketches for use in diagrams
dir.create(paste0(dir_name,'/montages/'))
for (this_category in categories){
  image_read(dir(paste(dir_name, "/",this_category,sep=""), full.names = TRUE)) %>%
  image_append(stack = FALSE) %>%
    image_write(file.path(paste0(dir_name,"/montages/", this_category,".png")))
}